
library(seqinr)
library(stringr)
library(dplyr)
library(vioplot)
library(gmodels)

# Directory holding the six tab-separated Musite prediction result files.
musite_dir <- "D:/Pipeline comparisons/Writing/Data/Musite/Rice"

# Read one Musite prediction file, drop rows whose Residue field is the empty
# string, and add a numeric Scores column taken from the last ":"-separated
# field of PTMscores.
read_musite_predictions <- function(path) {
  predictions <- read.delim(path, header = TRUE, sep = "\t", dec = ".")
  has_residue <- predictions$Residue != ""
  predictions <- predictions[has_residue, ]
  last_field <- word(predictions$PTMscores, -1, sep = ":")
  predictions$Scores <- as.numeric(last_field)
  predictions
}

file1 <- read_musite_predictions(file.path(musite_dir, "Prediction_results_File1.txt"))
file2 <- read_musite_predictions(file.path(musite_dir, "Prediction_results_File2.txt"))
file3 <- read_musite_predictions(file.path(musite_dir, "Prediction_results_File3.txt"))
file4 <- read_musite_predictions(file.path(musite_dir, "Prediction_results_File4.txt"))
file5 <- read_musite_predictions(file.path(musite_dir, "Prediction_results_File5.txt"))
file6 <- read_musite_predictions(file.path(musite_dir, "Prediction_results_File6.txt"))

# Stack the six cleaned tables into a single prediction table.
AllRice_Musite <- rbind.data.frame(
  file1, file2, file3,
  file4, file5, file6
)

# Load the per-pipeline GSB tables (TPP, MQ, PD) from a common directory.
gsb_dir <- "D:/Pipeline comparisons/Writing/Data/GSB/Rice"
TPP_GSB <- read.csv(file.path(gsb_dir, "TPP_GSB.csv"))
MQ_GSB <- read.csv(file.path(gsb_dir, "MQ_GSB.csv"))
PD_GSB <- read.csv(file.path(gsb_dir, "PD_GSB.csv"))

# For the MQ table the site key is (re)built as "<Single_Protein>_<PROTEIN_POS_NUM>".
mq_protein <- MQ_GSB$Single_Protein
mq_position <- MQ_GSB$PROTEIN_POS_NUM
MQ_GSB$PROTEIN_LOC <- paste0(mq_protein, "_", mq_position)

# Give the Musite predictions a key of the same form: "<ID>_<Position>".
musite_id <- AllRice_Musite$ID
musite_position <- AllRice_Musite$Position
AllRice_Musite$PROTEIN_LOC <- paste0(musite_id, "_", musite_position)

# Print the number of distinct predicted sites.
length(unique(AllRice_Musite$PROTEIN_LOC))

# Columns carried forward from every merged pipeline table.
result_cols <- c("PROTEIN_LOC", "FLR_Adj_Score", "Pipeline", "Scores", "cat", "Amino")

# For each pipeline: left-join its GSB table to the Musite predictions on
# PROTEIN_LOC (all.x = TRUE keeps every GSB row; rows without a matching
# Musite row get NA in the Musite columns), tag the rows with the pipeline
# name, and keep only the result columns.
Musite_TPP <- merge(TPP_GSB, AllRice_Musite, by = "PROTEIN_LOC", all.x = TRUE)
Musite_TPP$Pipeline <- "TPP"
R_Musite_TPP <- dplyr::select(Musite_TPP, dplyr::all_of(result_cols))

Musite_MQ <- merge(MQ_GSB, AllRice_Musite, by = "PROTEIN_LOC", all.x = TRUE)
Musite_MQ$Pipeline <- "MQ"
R_Musite_MQ <- dplyr::select(Musite_MQ, dplyr::all_of(result_cols))

Musite_PD <- merge(PD_GSB, AllRice_Musite, by = "PROTEIN_LOC", all.x = TRUE)
Musite_PD$Pipeline <- "PD"
R_Musite_PD <- dplyr::select(Musite_PD, dplyr::all_of(result_cols))

# Stack the three pipelines, then split on whether a Musite score is present.
All_Results_Musite <- rbind.data.frame(R_Musite_TPP, R_Musite_MQ, R_Musite_PD)
score_missing <- is.na(All_Results_Musite$Scores)

# Despite its name, All_Results_Musite_NA holds the rows that DO have a score;
# Musite_NA holds the rows whose score is NA.
All_Results_Musite_NA <- All_Results_Musite[!score_missing, ]
Musite_NA <- All_Results_Musite[score_missing, ]

# Cross-tabulate the rows without a Musite score: Pipeline against Amino.
CrossTable(x = Musite_NA$Pipeline, y = Musite_NA$Amino)

# Pairwise unpaired Wilcoxon tests of the scores between pipelines,
# with Bonferroni-adjusted p-values.
pairwise.wilcox.test(
  x = All_Results_Musite_NA$Scores,
  g = All_Results_Musite_NA$Pipeline,
  p.adjust.method = "bonferroni",
  paired = FALSE
)

# Three box fill colours, one per pipeline.
colors <- c("#FFE0B2", "#FFA726", "#F57C00")

# Enlarge axis titles and tick labels, then draw score distributions per pipeline.
par(cex.lab = 1.5, cex.axis = 1.5)
boxplot(Scores ~ Pipeline, data = All_Results_Musite, col = colors)



################ Including not chosen sites at peptidoform level ###################


# Read the peptidoform-level ("pform") CSV for one pipeline and one dataset
# accession. Files live under <Data>/<pipeline>/Rice/pform/ and are named
# "<pipeline>_<accession>A_pform.csv".
read_pform <- function(pipeline, accession) {
  file_name <- paste0(pipeline, "_", accession, "A_pform.csv")
  read.csv(
    file = file.path(
      "D:/Pipeline comparisons/Writing/Data", pipeline, "Rice/pform", file_name
    )
  )
}

# TPP results, one table per dataset.
PXD000923_TPP <- read_pform("TPP", "PXD000923")
PXD002222_TPP <- read_pform("TPP", "PXD002222")
PXD002756_TPP <- read_pform("TPP", "PXD002756")
PXD004705_TPP <- read_pform("TPP", "PXD004705")
PXD004939_TPP <- read_pform("TPP", "PXD004939")
PXD005241_TPP <- read_pform("TPP", "PXD005241")
PXD012764_TPP <- read_pform("TPP", "PXD012764")
PXD019291_TPP <- read_pform("TPP", "PXD019291")

# MQ results, one table per dataset.
PXD000923_MQ <- read_pform("MQ", "PXD000923")
PXD002222_MQ <- read_pform("MQ", "PXD002222")
PXD002756_MQ <- read_pform("MQ", "PXD002756")
PXD004705_MQ <- read_pform("MQ", "PXD004705")
PXD004939_MQ <- read_pform("MQ", "PXD004939")
PXD005241_MQ <- read_pform("MQ", "PXD005241")
PXD012764_MQ <- read_pform("MQ", "PXD012764")
PXD019291_MQ <- read_pform("MQ", "PXD019291")

# PD results, one table per dataset.
PXD000923_PD <- read_pform("PD", "PXD000923")
PXD002222_PD <- read_pform("PD", "PXD002222")
PXD002756_PD <- read_pform("PD", "PXD002756")
PXD004705_PD <- read_pform("PD", "PXD004705")
PXD004939_PD <- read_pform("PD", "PXD004939")
PXD005241_PD <- read_pform("PD", "PXD005241")
PXD012764_PD <- read_pform("PD", "PXD012764")
PXD019291_PD <- read_pform("PD", "PXD019291")

# Reduce every per-dataset table to the site-key column(s) and stack them per
# pipeline.
#
# Each *_Red table is taken from its OWN dataset. Previously every line selected
# from the PXD000923 table, so the stacked tables were eight copies of the first
# dataset and the other seven datasets never contributed.

# TPP: only the site key is needed.
PXD000923_TPP_Red <- dplyr::select(PXD000923_TPP, c("PROTEIN_LOC"))
PXD002222_TPP_Red <- dplyr::select(PXD002222_TPP, c("PROTEIN_LOC"))
PXD002756_TPP_Red <- dplyr::select(PXD002756_TPP, c("PROTEIN_LOC"))
PXD004705_TPP_Red <- dplyr::select(PXD004705_TPP, c("PROTEIN_LOC"))
PXD004939_TPP_Red <- dplyr::select(PXD004939_TPP, c("PROTEIN_LOC"))
PXD005241_TPP_Red <- dplyr::select(PXD005241_TPP, c("PROTEIN_LOC"))
PXD012764_TPP_Red <- dplyr::select(PXD012764_TPP, c("PROTEIN_LOC"))
PXD019291_TPP_Red <- dplyr::select(PXD019291_TPP, c("PROTEIN_LOC"))

# MQ: Single_Protein and PROTEIN_POS_NUM are kept as well, because the site key
# is rebuilt from them later in the script.
PXD000923_MQ_Red <- dplyr::select(PXD000923_MQ, c("PROTEIN_LOC", "Single_Protein", "PROTEIN_POS_NUM"))
PXD002222_MQ_Red <- dplyr::select(PXD002222_MQ, c("PROTEIN_LOC", "Single_Protein", "PROTEIN_POS_NUM"))
PXD002756_MQ_Red <- dplyr::select(PXD002756_MQ, c("PROTEIN_LOC", "Single_Protein", "PROTEIN_POS_NUM"))
PXD004705_MQ_Red <- dplyr::select(PXD004705_MQ, c("PROTEIN_LOC", "Single_Protein", "PROTEIN_POS_NUM"))
PXD004939_MQ_Red <- dplyr::select(PXD004939_MQ, c("PROTEIN_LOC", "Single_Protein", "PROTEIN_POS_NUM"))
PXD005241_MQ_Red <- dplyr::select(PXD005241_MQ, c("PROTEIN_LOC", "Single_Protein", "PROTEIN_POS_NUM"))
PXD012764_MQ_Red <- dplyr::select(PXD012764_MQ, c("PROTEIN_LOC", "Single_Protein", "PROTEIN_POS_NUM"))
PXD019291_MQ_Red <- dplyr::select(PXD019291_MQ, c("PROTEIN_LOC", "Single_Protein", "PROTEIN_POS_NUM"))

# PD: only the site key is needed.
PXD000923_PD_Red <- dplyr::select(PXD000923_PD, c("PROTEIN_LOC"))
PXD002222_PD_Red <- dplyr::select(PXD002222_PD, c("PROTEIN_LOC"))
PXD002756_PD_Red <- dplyr::select(PXD002756_PD, c("PROTEIN_LOC"))
PXD004705_PD_Red <- dplyr::select(PXD004705_PD, c("PROTEIN_LOC"))
PXD004939_PD_Red <- dplyr::select(PXD004939_PD, c("PROTEIN_LOC"))
PXD005241_PD_Red <- dplyr::select(PXD005241_PD, c("PROTEIN_LOC"))
PXD012764_PD_Red <- dplyr::select(PXD012764_PD, c("PROTEIN_LOC"))
PXD019291_PD_Red <- dplyr::select(PXD019291_PD, c("PROTEIN_LOC"))

# Stack the eight datasets of each pipeline into one table per pipeline.
All_TPP <- rbind.data.frame(
  PXD000923_TPP_Red, PXD002222_TPP_Red, PXD002756_TPP_Red, PXD004705_TPP_Red,
  PXD004939_TPP_Red, PXD005241_TPP_Red, PXD012764_TPP_Red, PXD019291_TPP_Red
)
All_MQ <- rbind.data.frame(
  PXD000923_MQ_Red, PXD002222_MQ_Red, PXD002756_MQ_Red, PXD004705_MQ_Red,
  PXD004939_MQ_Red, PXD005241_MQ_Red, PXD012764_MQ_Red, PXD019291_MQ_Red
)
All_PD <- rbind.data.frame(
  PXD000923_PD_Red, PXD002222_PD_Red, PXD002756_PD_Red, PXD004705_PD_Red,
  PXD004939_PD_Red, PXD005241_PD_Red, PXD012764_PD_Red, PXD019291_PD_Red
)

# Rebuild the MQ site key as "<Single_Protein>_<PROTEIN_POS_NUM>", then drop
# the two helper columns so All_MQ has the same layout as All_TPP / All_PD.
mq_site_key <- paste0(All_MQ$Single_Protein, "_", All_MQ$PROTEIN_POS_NUM)
All_MQ$PROTEIN_LOC <- mq_site_key
All_MQ[c("Single_Protein", "PROTEIN_POS_NUM")] <- NULL

# Tag every row with its pipeline and with the "Not chosen" category.
All_TPP$Pipeline <- "TPP"
All_TPP$cat <- "Not chosen"

All_MQ$Pipeline <- "MQ"
All_MQ$cat <- "Not chosen"

All_PD$Pipeline <- "PD"
All_PD$cat <- "Not chosen"

All_Data <- rbind.data.frame(All_TPP, All_MQ, All_PD)

# Keep only sites that do not already appear among the scored results.
already_scored <- All_Data$PROTEIN_LOC %in% All_Results_Musite_NA$PROTEIN_LOC
All_Data_NC <- All_Data[!already_scored, ]

# Keep the first row for each site key.
# NOTE(review): duplicates are dropped on PROTEIN_LOC alone, so a site present
# for several pipelines is kept only for the first one in row order
# (TPP, then MQ, then PD) -- confirm this is intended.
repeated_site <- duplicated(All_Data_NC$PROTEIN_LOC)
All_Data_NC <- All_Data_NC[!repeated_site, ]

# Attach Musite predictions (left join on the site key) and discard sites
# for which no score is available.
Musite_All_Data_NC <- merge(
  All_Data_NC, AllRice_Musite,
  by = "PROTEIN_LOC", all.x = TRUE
)
has_score <- !is.na(Musite_All_Data_NC$Scores)
Musite_All_Data_NC <- Musite_All_Data_NC[has_score, ]


# Combine the scored result rows with the scored "Not chosen" rows, keeping
# only the columns used for plotting and testing.
plot_cols <- c("Pipeline", "cat", "Scores")
scored_results <- dplyr::select(All_Results_Musite_NA, dplyr::all_of(plot_cols))
scored_not_chosen <- dplyr::select(Musite_All_Data_NC, dplyr::all_of(plot_cols))
FinalSet <- rbind.data.frame(scored_results, scored_not_chosen)

# Score distributions for every category-by-pipeline combination
# (`colors` is the fill vector defined earlier in the script).
par(cex.lab = 1.2, cex.axis = 1.2)
boxplot(Scores ~ cat * Pipeline, data = FinalSet, col = colors)

# One grouping label per row: "<Pipeline>.<cat>".
FinalSet$Group <- paste0(FinalSet[["Pipeline"]], ".", FinalSet[["cat"]])

# Pairwise unpaired Wilcoxon tests between all groups, Bonferroni-adjusted.
pairwise.wilcox.test(
  x = FinalSet$Scores,
  g = FinalSet$Group,
  p.adjust.method = "bonferroni",
  paired = FALSE
)



